##  reverseformat.pl
##
##  This program converts post-SVM fully processed news stories back to the pre-SVM processing stage.
##  The output generated by this program looks exactly like the output generated by the "Step One Text Formatting"
##  filters.  These stories can then be re-run through SVM to produce a new or additional learning model.
##
##   Redistribution and use in source and binary forms, with or without modification,
##   are permitted under the terms of the GNU General Public License:
##   http://www.opensource.org/licenses/gpl-license.html
##
##  Report bugs to: vjd125@psu.edu
##
##  For plausible indenting of this source code, set the tab size in your editor to "2"
##
##  REVISION HISTORY:
##  Original code base appears to have been one or more of the various Lexis-Nexis filters
##  maintained by the KEDS project, http://eventdata.psu.edu
##  Sep-08:  Initial MID-SVM version for 1999 and 2001 downloads by Vladimir Petroff, University of Kansas
##  Jun-10:  Current program adapted from genf.pl by Vito D'Orazio, Pennsylvania State University


#!/usr/local/bin/perl

# ======== globals =========== #


# Fixed file names used by the main program.
$file_list = './files.list';      # text file naming the input files, one per line
$outfile   = 'nnfoutfile.txt';    # working output file; renamed when the run finishes
$skip      = 'skip';              # suffix of the file meant to hold skipped input

# Month abbreviation => two-digit month number ('Jan' => '01' ... 'Dec' => '12').
{
  my @abbrevs = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
  %month_number = map { ($abbrevs[$_] => sprintf('%02d', $_ + 1)) } 0 .. $#abbrevs;
}

# Run-wide counters, all starting at zero:
#   $filt   - number of lines filtered
#   $storyN - number of stories written to the output file
#   $kfile  - number of input files processed
$filt = $storyN = $kfile = 0;

# Accumulates the text of the story currently being read.
$body = '';

# ======== subroutines =========== #

sub Filter {
#  Takes the text of one story (everything found between two lines of dashes) and
#  reverse-formats it back to the pre-SVM processing phase, appending the result to FOUT.
#  This enables the user to re-run "Step Two: Generating SVM Input" in order to update
#  the learning model.
#
#  Argument:
#    $_[0] - the story text, lines separated by "\n" (undef is treated as empty).
#
#  Each line is classified by the first rule that applies:
#    1. a line containing "SVM" is dropped and switches on body collection;
#    2. blank (empty or whitespace-only) lines are dropped;
#    3. the first line containing a word character is the KEY;
#    4. the line immediately following the KEY is the HEADLINE;
#    5. the first line shaped like "Month DD, YYYY" is the DATE;
#    6. a line containing "News source" supplies the SOURCE;
#    7. once body collection is on, the line is appended to the body text.
#
#  Globals used: FOUT (output handle, must already be open), $seqno and $storyN (both
#  incremented), $srcid and $filename (progress message only), and $nhl (default headline;
#  NOTE(review): $nhl is not assigned anywhere in this file, so the default is empty).
#  The return value is not meaningful.

  my ($story) = @_;
  $story = '' unless defined $story;

  # Per-story state is lexical so nothing leaks from one story into the next
  # (in particular, a story without a date line prints an empty date).
  my $head       = defined $nhl ? $nhl : '';   # default when no headline is found
  my $key        = "No Key";
  my $source     = "No Source";
  my $date       = '';
  my $text       = '';
  my $have_key   = 0;    # true once the KEY line has been seen
  my $have_date  = 0;    # true once the DATE line has been seen
  my $head_index = -1;   # index of the line expected to hold the HEADLINE
  my $grab_body  = 0;    # true once the SVM line has been seen

  # split() keeps empty fields between consecutive newlines (only trailing ones are
  # dropped), so blank lines have to be skipped explicitly below.
  my @values = split(/\n/, $story);
  for my $ka (0 .. $#values) {
    my $val = $values[$ka];

    # Skipping the SVM score and identifying when to grab the body
    if ($val =~ m/SVM/) {
      $grab_body = 1;
      next;
    }

    # Skipping blank lines
    next if $val =~ m/^\s*$/;

    # The FIRST LINE OF TEXT is the KEY
    if (!$have_key && $val =~ m/\w+/) {
      $have_key   = 1;
      $key        = $val;
      $head_index = $ka + 1;
      next;
    }

    # HEADLINE is the line right after the KEY (positional: if that line is blank or
    # is the SVM line, the default headline is kept)
    if ($ka == $head_index) {
      $head = $val;
      next;
    }

    # Only the first line in DATE format is the dateline; later matches are ordinary
    # text, so a sentence that mentions a date stays in the body.
    if (!$have_date && $val =~ m/(\w+) (\d+), (\d\d\d\d)/) {
      $date      = $val;
      $have_date = 1;
      next;
    }

    # Establishing the News Source: keep only what follows the "News source" label,
    # wherever the label sits on the line.
    # NOTE(review): there is deliberately no "next" here, as in the original code, so a
    # source line met after the SVM line is also copied into the body -- confirm intent.
    if ($val =~ m/News source:?\s*(.*)$/) {
      $source = $1;
    }

    # Establishing the body of the text
    if ($grab_body) {
      $text .= $val . "\n";
    }

  } # loop over the lines of the story

  ++$seqno;   # string increment: '0001' becomes '0002', leading zeros are kept

  ## FORMATTING AND PRINTING TO FILE RELEVANT INFORMATION ##

  for my $field ($head, $source, $date, $key) {
    $field =~ s/^\s+//;   # remove leading whites
  }

  chomp($text);
  print "Printing story...newdate-$srcid-$seqno-$filename\n";
  print FOUT "---------------------------------------------------------------\n";
  print FOUT "$head\n";
  print FOUT "$key\n";
  print FOUT "$date\n";
  print FOUT "$source\n";
  print FOUT "\n";
  print FOUT "$text\n\n";

  $storyN++; #count how many stories processed

}


# ======== main program =========== #

# Usage: perl reverseformat.pl <prefix>
#
# Reads the input file names listed in $file_list (one per line), cuts every input file
# into stories delimited by lines of dashes, hands each story to Filter(), which appends
# the reverse-formatted story to $outfile, and finally renames $outfile to "reversed.0".
# <prefix> names the skip file "<prefix>.$skip"; that file is created but nothing in this
# program writes to it.

$fileprfx = $ARGV[0];  # prefix of the skip file name
if (!defined($fileprfx) || length($fileprfx) == 0) {
  print "\aFile name prefix is a required argument! -- please re-run program\n";
  exit 1;   # a missing argument is a failure, so report it in the exit status
}

# Open skip file, list of inputs and output file.  FOUT stays a bareword handle because
# Filter() prints to it by name; the other handles are lexical.
open(my $skip_fh, '>', "$fileprfx.$skip") or die "Can\'t open skipped file $skip; error $!";
open(my $list_fh, '<', $file_list)        or die "Can\'t open list of input files $file_list; error $!";
open(FOUT, '>', $outfile)                 or die "Can\'t open output file $outfile; error $!";

# A single pattern both ends a story and starts the next one, so no story can be lost
# between a line that closes one story and a (longer) line required to open another.
my $separator = qr/-{17,}/;

# $filename and $seqno are package globals on purpose: Filter() reads them.
DIRECTORY: while ($filename = <$list_fh>) {
  $filename =~ s/\r?\n\z//;                 # drop the line ending (LF or CRLF)
  next DIRECTORY if $filename !~ /\S/;      # ignore blank lines in the list

  print "\n==========================\nProcessing $filename\n";

  $seqno = '0001';
  open(my $in, '<', $filename) or die "Can\'t open input file $filename; error $!";

  my $line = <$in>;
  FILE: while (defined $line) {

    # Not at a separator yet: keep scanning.
    if ($line !~ $separator) {
      $line = <$in>;
      next FILE;
    }

    # Found the start of a story: collect lines up to the next separator or end of
    # file.  When the loop stops on a separator, $line still holds it, so the next
    # pass of FILE starts the following story from there.
    my $story = '';
    STORY: while (defined($line = <$in>)) {
      last STORY if $line =~ $separator;
      $story .= $line;
    }

    # Back-to-back separators or a trailing separator enclose no story.
    next FILE if $story !~ /\S/;

    # Take the body and reformat; only relevant information is output.
    Filter($story);
    print "Looping...$kfile files, $storyN stories\n";

  } # while individual file

  close($in);
  $kfile++; #count how many input files

} # while file list loop

# Close the output first so buffered data is flushed (and write errors are seen) before
# the file is renamed.
close(FOUT) or die "Can\'t close output file $outfile; error $!";
close($skip_fh);
close($list_fh);

rename($outfile, "reversed.0") or die "Can\'t rename output file $outfile; error $!";

print "\n\aFinished: $kfile files processed, $storyN stories.\n";
exit;
